Calculate/load Differential Expression metrics for all genes, load SFARI dataset

load('./../working_data/RNAseq_ASD_4region_normalized.Rdata')

# Balance Groups by covariates, remove singular batches (none)
to_keep = (datMeta$Subject_ID != 'AN03345') & !is.na(datMeta$Dx)
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
  
if(!file.exists('./../working_data/genes_ASD_DE_info.csv')) {
  
  # Calculate differential expression for ASD
  mod = model.matrix(~ Dx, data=datMeta)
  corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
  lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
  
  fit = eBayes(lmfit, trend=T, robust=T)
  top_genes = topTable(fit, coef=2, number=nrow(datExpr))
  genes_DE_info = top_genes[match(rownames(datExpr), rownames(top_genes)),] %>%
                  mutate('ID'=rownames(datExpr))
  
  write_csv(genes_DE_info, path='./../working_data/genes_ASD_DE_info.csv')
  
  rm(mod, corfit, lmfit, fit, top_genes)
} else {
genes_DE_info = read_csv('./../working_data/genes_ASD_DE_info.csv')
}

SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')
SFARI_genes = SFARI_genes %>% inner_join(datProbes, by=c('gene-symbol'='external_gene_id')) %>%
              mutate('ID' = ensembl_gene_id) %>%
              dplyr::select(ID, `gene-score`, syndromic)

rm(to_keep, datSeq, datProbes)

Experiment 1: Changes in PCA plots for different filtering thresholds

lfc=-1 means no filtering at all, the rest of the filterings include an adjusted p-value lower than 0.05

Note: PC values get smaller as Log2 fold change increases, so on each iterations the values were scaled so it would be easier to compare between frames

lfc_list = c(seq(0, 1.4, 0.1), seq(1.45, 2, 0.05))

n_genes = nrow(datExpr)

# Calculate PCAs
datExpr_pca_samps = datExpr %>% data.frame %>% t %>% prcomp(scale.=TRUE)
datExpr_pca_genes = datExpr %>% data.frame %>% prcomp(scale.=TRUE)

# Initialice DF to save PCA outputs
pcas_samps = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
             mutate('ID'=colnames(datExpr), 'lfc'=-1, PC1=scale(PC1), PC2=scale(PC2))
pcas_genes = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
             mutate('ID'=rownames(datExpr), 'lfc'=-1, PC1=scale(PC1), PC2=scale(PC2))

pca_samps_old = pcas_samps
pca_genes_old = pcas_genes

for(lfc in lfc_list){
  
  # Filter DE genes with iteration's criteria
  DE_genes = genes_DE_info %>% filter(adj.P.Val<0.05 & abs(logFC)>lfc)
  datExpr_DE = datExpr %>% data.frame %>% filter(rownames(.) %in% DE_genes$ID)
  n_genes = c(n_genes, nrow(DE_genes))
  
  # Calculate PCAs
  datExpr_pca_samps = datExpr_DE %>% t %>% prcomp(scale.=TRUE)
  datExpr_pca_genes = datExpr_DE %>% prcomp(scale.=TRUE)

  # Create new DF entries
  pca_samps_new = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
                  mutate('ID'=colnames(datExpr), 'lfc'=lfc, PC1=scale(PC1), PC2=scale(PC2))
  pca_genes_new = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
                  mutate('ID'=DE_genes$ID, 'lfc'=lfc, PC1=scale(PC1), PC2=scale(PC2))  
  
  # Change PC sign if necessary
  if(cor(pca_samps_new$PC2, pca_samps_old$PC2)<0) pca_samps_new$PC2 = -pca_samps_new$PC2
  if(cor(pca_genes_new$PC1, pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC1 )<0){
    pca_genes_new$PC1 = -pca_genes_new$PC1
  }
  if(cor(pca_genes_new$PC2, pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC2 )<0){
    pca_genes_new$PC2 = -pca_genes_new$PC2
  }
  
  pca_samps_old = pca_samps_new
  pca_genes_old = pca_genes_new
  
  # Update DFs
  pcas_samps = rbind(pcas_samps, pca_samps_new)
  pcas_genes = rbind(pcas_genes, pca_genes_new)
  
}

# Add Diagnosis/SFARI score information
pcas_samps = pcas_samps %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
             dplyr::select(ID, PC1, PC2, lfc, Diagnosis_)
pcas_genes = pcas_genes %>% left_join(SFARI_genes, by='ID') %>% 
             mutate('score'=as.factor(`gene-score`)) %>%
             dplyr::select(ID, PC1, PC2, lfc, score)

# Plot change of number of genes
data.frame('lfc'=lfc_list, 'n_genes'=n_genes[-1]) %>% ggplot(aes(x=lfc, y=n_genes)) + 
  geom_point() + geom_line() + theme_minimal() + 
  ggtitle('Number of remaining genes when modifying filtering threshold')

rm(datExpr_pca_genes, datExpr_pca_samps, DE_genes, datExpr_DE, pca_genes_new, pca_samps_new, 
   pca_genes_old, pca_samps_old, lfc_list, lfc)

Samples

ggplotly(pcas_samps %>% ggplot(aes(PC1, PC2, color=Diagnosis_)) + geom_point(aes(frame=lfc, ids=ID)) + 
         theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))

SFARI genes by score

pcas_sfari_genes = pcas_genes %>% filter(!is.na(score)) %>% dplyr::select(-'score')

complete_sfari_df = expand.grid(unique(pcas_sfari_genes$ID), unique(pcas_sfari_genes$lfc))
colnames(complete_sfari_df) = c('ID', 'lfc')

pcas_sfari_genes = pcas_sfari_genes %>% right_join(complete_sfari_df, by=c('ID','lfc')) %>% 
                   left_join(SFARI_genes, by='ID') %>% 
                   mutate('score'=as.factor(`gene-score`), 'syndromic'=as.factor(syndromic))
pcas_sfari_genes[is.na(pcas_sfari_genes)] = 0
  
ggplotly(pcas_sfari_genes %>% ggplot(aes(PC1, PC2, color=score)) + 
         geom_point(aes(frame=lfc, ids=ID), alpha=0.6) + theme_minimal() + 
         ggtitle('Genes PCA plot modifying filtering threshold'))

SFARI genes by syndromic

ggplotly(pcas_sfari_genes %>% ggplot(aes(PC1, PC2, color=syndromic)) + 
         geom_point(aes(frame=lfc, ids=ID), alpha=0.6) + theme_minimal() + 
         ggtitle('Genes PCA plot modifying filtering threshold'))

All genes

ggplotly(pcas_genes %>% ggplot(aes(PC1, PC2)) + geom_point(aes(frame=lfc, ids=ID, alpha=0.5)) + 
         theme_minimal() + ggtitle('Genes PCA plot modifying filtering threshold'))